In [ ]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
In [5]:
a = pd.read_csv(r'C:/Users/Brian/Desktop/baseball_logistic/batting.csv')
In [7]:
m = pd.read_csv(r'C:/Users/Brian/Desktop/baseball_logistic/Master.csv')
In [6]:
a.head()
Out[6]:
playerID yearID stint teamID lgID G AB R H 2B ... RBI SB CS BB SO IBB HBP SH SF GIDP
0 abercda01 1871 1 TRO NaN 1 4 0 0 0 ... 0.0 0.0 0.0 0 0.0 NaN NaN NaN NaN NaN
1 addybo01 1871 1 RC1 NaN 25 118 30 32 6 ... 13.0 8.0 1.0 4 0.0 NaN NaN NaN NaN NaN
2 allisar01 1871 1 CL1 NaN 29 137 28 40 4 ... 19.0 3.0 1.0 2 5.0 NaN NaN NaN NaN NaN
3 allisdo01 1871 1 WS3 NaN 27 133 28 44 10 ... 27.0 1.0 1.0 0 2.0 NaN NaN NaN NaN NaN
4 ansonca01 1871 1 RC1 NaN 25 120 29 39 11 ... 16.0 6.0 2.0 2 1.0 NaN NaN NaN NaN NaN

5 rows × 22 columns

In [13]:
a.describe()
Out[13]:
yearID stint G AB R H 2B 3B HR RBI SB CS BB SO IBB HBP SH SF GIDP
count 101332.000000 101332.000000 101332.000000 96183.000000 96183.000000 96183.000000 96183.000000 96183.000000 96183.000000 95759.000000 94883.000000 72729.000000 96183.000000 88345.000000 59620.000000 93373.000000 89845.000000 60151.000000 70075.000000
mean 1963.506533 1.077567 51.400111 149.970327 19.887038 39.261647 6.637067 1.373361 2.949305 17.965163 3.158184 1.324025 13.811484 21.629849 1.213234 1.113395 2.457900 1.150122 3.210032
std 38.628278 0.283676 47.145273 186.557072 28.671365 53.310941 9.801563 2.710547 6.409662 26.756514 7.922994 2.838196 21.092775 28.432978 2.894918 2.320660 4.347818 2.023981 4.835881
min 1871.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1933.000000 1.000000 13.000000 7.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 1972.000000 1.000000 34.000000 57.000000 5.000000 11.000000 1.000000 0.000000 0.000000 4.000000 0.000000 0.000000 3.000000 10.000000 0.000000 0.000000 1.000000 0.000000 1.000000
75% 1997.000000 1.000000 80.250000 251.000000 30.000000 63.000000 10.000000 2.000000 3.000000 27.000000 2.000000 1.000000 20.000000 30.000000 1.000000 1.000000 3.000000 2.000000 5.000000
max 2015.000000 5.000000 165.000000 716.000000 192.000000 262.000000 67.000000 36.000000 73.000000 191.000000 138.000000 42.000000 232.000000 223.000000 120.000000 51.000000 67.000000 19.000000 36.000000
In [8]:
m.head()
Out[8]:
playerID birthYear birthMonth birthDay birthCountry birthState birthCity deathYear deathMonth deathDay ... nameLast nameGiven weight height bats throws debut finalGame retroID bbrefID
0 aardsda01 1981.0 12.0 27.0 USA CO Denver NaN NaN NaN ... Aardsma David Allan 215.0 75.0 R R 2004-04-06 2015-08-23 aardd001 aardsda01
1 aaronha01 1934.0 2.0 5.0 USA AL Mobile NaN NaN NaN ... Aaron Henry Louis 180.0 72.0 R R 1954-04-13 1976-10-03 aaroh101 aaronha01
2 aaronto01 1939.0 8.0 5.0 USA AL Mobile 1984.0 8.0 16.0 ... Aaron Tommie Lee 190.0 75.0 R R 1962-04-10 1971-09-26 aarot101 aaronto01
3 aasedo01 1954.0 9.0 8.0 USA CA Orange NaN NaN NaN ... Aase Donald William 190.0 75.0 R R 1977-07-26 1990-10-03 aased001 aasedo01
4 abadan01 1972.0 8.0 25.0 USA FL Palm Beach NaN NaN NaN ... Abad Fausto Andres 184.0 73.0 L L 2001-09-10 2006-04-13 abada001 abadan01

5 rows × 24 columns

In [9]:
m.columns
Out[9]:
Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID'],
      dtype='object')
In [15]:
name = m[['playerID','debut', 'finalGame','birthCountry',
       'birthState', 'birthCity', 'nameFirst', 'nameLast',
       'nameGiven']]

Tesrt asdgasdg;lkjdasfj

asdlkjfasd

In [16]:
final = a.merge(name, how='left', on='playerID')

teetewstaewetweta

In [17]:
final.head()
Out[17]:
playerID yearID stint teamID lgID G AB R H 2B ... SF GIDP debut finalGame birthCountry birthState birthCity nameFirst nameLast nameGiven
0 abercda01 1871 1 TRO NaN 1 4 0 0 0 ... NaN NaN 1871-10-21 1871-10-21 USA OK Fort Towson Frank Abercrombie Francis Patterson
1 addybo01 1871 1 RC1 NaN 25 118 30 32 6 ... NaN NaN 1871-05-06 1877-10-06 CAN ON Port Hope Bob Addy Robert Edward
2 allisar01 1871 1 CL1 NaN 29 137 28 40 4 ... NaN NaN 1871-05-04 1876-10-05 USA PA Philadelphia Art Allison Arthur Algernon
3 allisdo01 1871 1 WS3 NaN 27 133 28 44 10 ... NaN NaN 1871-05-05 1883-07-13 USA PA Philadelphia Doug Allison Douglas L.
4 ansonca01 1871 1 RC1 NaN 25 120 29 39 11 ... NaN NaN 1871-05-06 1897-10-03 USA IA Marshalltown Cap Anson Adrian Constantine

5 rows × 30 columns

In [18]:
final.columns
Out[18]:
Index(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'G', 'AB', 'R', 'H',
       '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH',
       'SF', 'GIDP', 'debut', 'finalGame', 'birthCountry', 'birthState',
       'birthCity', 'nameFirst', 'nameLast', 'nameGiven'],
      dtype='object')
In [19]:
new = final[['playerID','nameFirst', 'nameLast','yearID', 'stint', 'teamID', 'lgID', 'G', 'AB', 'R', 'H',
       '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP', 'debut', 'finalGame', 'birthCountry', 'birthState',
       'birthCity']]
In [35]:
new[(new['nameFirst']=='Rick') & (new['nameLast']=='Lancellotti')]
Out[35]:
playerID nameFirst nameLast yearID stint teamID lgID G AB R ... CS BB SO IBB HBP debut finalGame birthCountry birthState birthCity
59724 lanceri01 Rick Lancellotti 1982 1 SDN NL 17 39 2 ... 0.0 2 8.0 0.0 0.0 1982-08-27 1990-08-18 USA RI Providence
63736 lanceri01 Rick Lancellotti 1986 1 SFN NL 15 18 2 ... 0.0 0 7.0 0.0 0.0 1982-08-27 1990-08-18 USA RI Providence
67954 lanceri01 Rick Lancellotti 1990 1 BOS AL 4 8 0 ... 0.0 0 3.0 0.0 0.0 1982-08-27 1990-08-18 USA RI Providence

3 rows × 26 columns

In [29]:
new.sort_values(by='HBP',ascending=False)
Out[29]:
playerID nameFirst nameLast yearID stint teamID lgID G AB R ... CS BB SO IBB HBP debut finalGame birthCountry birthState birthCity
6801 jennihu01 Hughie Jennings 1896 1 BLN NL 130 521 125 ... NaN 19 11.0 NaN 51.0 1891-06-01 1918-09-02 USA PA Pittston
49476 huntro01 Ron Hunt 1971 1 MON NL 152 520 89 ... 7.0 58 41.0 1.0 50.0 1963-04-16 1974-09-28 USA MO St. Louis
7384 jennihu01 Hughie Jennings 1898 1 BLN NL 143 534 135 ... NaN 78 NaN NaN 46.0 1891-06-01 1918-09-02 USA PA Pittston
7090 jennihu01 Hughie Jennings 1897 1 BLN NL 117 439 133 ... NaN 42 NaN NaN 46.0 1891-06-01 1918-09-02 USA PA Pittston
7433 mcganda01 Dan McGann 1898 1 BLN NL 145 535 99 ... NaN 53 NaN NaN 39.0 1896-08-08 1908-10-07 USA KY Shelbyville
5444 welchcu01 Curt Welch 1891 1 BL3 AA 132 514 122 ... NaN 77 42.0 NaN 36.0 1884-05-01 1893-05-23 USA OH Williamsport
63287 baylodo01 Don Baylor 1986 1 BOS AL 160 585 93 ... 5.0 62 111.0 8.0 35.0 1970-09-18 1988-10-01 USA TX Austin
75494 biggicr01 Craig Biggio 1997 1 HOU NL 162 619 146 ... 10.0 84 107.0 6.0 34.0 1988-06-26 2007-09-30 USA NY Smithtown
4451 tucketo01 Tommy Tucker 1889 1 BL2 AA 134 527 103 ... NaN 42 26.0 NaN 33.0 1887-04-16 1899-09-13 USA MA Holyoke
5024 welchcu01 Curt Welch 1890 1 PH4 AA 103 396 100 ... NaN 49 NaN NaN 32.0 1884-05-01 1893-05-23 USA OH Williamsport
6493 jennihu01 Hughie Jennings 1895 1 BLN NL 131 529 159 ... NaN 24 17.0 NaN 32.0 1891-06-01 1918-09-02 USA PA Pittston
12234 evansst01 Steve Evans 1910 1 SLN NL 151 506 73 ... NaN 78 63.0 NaN 31.0 1908-04-16 1915-10-03 USA OH Cleveland
75992 kendaja01 Jason Kendall 1997 1 PIT NL 144 486 71 ... 6.0 49 53.0 2.0 31.0 1996-04-01 2010-08-30 USA CA San Diego
77242 kendaja01 Jason Kendall 1998 1 PIT NL 149 535 95 ... 5.0 51 51.0 3.0 31.0 1996-04-01 2010-08-30 USA CA San Diego
85943 wilsocr03 Craig Wilson 2004 1 PIT NL 155 561 97 ... 2.0 50 169.0 3.0 30.0 2001-04-22 2007-05-11 USA CA Fountain Valley
100986 rizzoan01 Anthony Rizzo 2015 1 CHN NL 160 586 94 ... 6.0 78 105.0 9.0 30.0 2011-06-09 2016-10-02 USA FL Fort Lauderdale
5430 tucketo01 Tommy Tucker 1891 1 BSN NL 140 548 103 ... NaN 37 30.0 NaN 29.0 1887-04-16 1899-09-13 USA MA Holyoke
4112 welchcu01 Curt Welch 1888 1 PH4 AA 136 549 125 ... NaN 33 NaN NaN 29.0 1884-05-01 1893-05-23 USA OH Williamsport
3760 tucketo01 Tommy Tucker 1887 1 BL2 AA 136 524 114 ... NaN 29 NaN NaN 29.0 1887-04-16 1899-09-13 USA MA Holyoke
4912 rosemch01 Chief Roseman 1890 1 SL4 AA 80 302 47 ... NaN 30 NaN NaN 29.0 1882-05-01 1890-08-25 USA NY Brooklyn
80517 vinafe01 Fernando Vina 2000 1 SLN NL 123 487 81 ... 8.0 36 36.0 0.0 28.0 1993-04-10 2004-05-11 USA CA Sacramento
80740 biggicr01 Craig Biggio 2001 1 HOU NL 155 617 118 ... 4.0 66 100.0 4.0 28.0 1988-06-26 2007-09-30 USA NY Smithtown
5215 gilbepe01 Pete Gilbert 1891 1 BL3 AA 139 513 81 ... NaN 37 77.0 NaN 28.0 1890-09-06 1894-09-30 USA CT Baltic
74243 biggicr01 Craig Biggio 1996 1 HOU NL 162 605 113 ... 7.0 75 72.0 0.0 27.0 1988-06-26 2007-09-30 USA NY Smithtown
83399 biggicr01 Craig Biggio 2003 1 HOU NL 153 628 102 ... 4.0 57 116.0 3.0 27.0 1988-06-26 2007-09-30 USA NY Smithtown
82291 eckstda01 David Eckstein 2002 1 ANA AL 152 608 107 ... 13.0 45 44.0 0.0 27.0 2001-04-03 2010-10-03 USA FL Sanford
6192 jennihu01 Hughie Jennings 1894 1 BLN NL 128 501 134 ... NaN 37 17.0 NaN 27.0 1891-06-01 1918-09-02 USA PA Pittston
91359 utleych01 Chase Utley 2008 1 PHI NL 159 607 113 ... 2.0 64 104.0 14.0 27.0 2003-04-04 2016-10-02 USA CA Pasadena
97245 choosh01 Shin-Soo Choo 2013 1 CIN NL 154 569 107 ... 11.0 112 133.0 5.0 26.0 2005-04-21 2016-10-02 South Korea Busan Busan
48567 huntro01 Ron Hunt 1970 1 SFN NL 117 367 70 ... 2.0 44 29.0 1.0 26.0 1963-04-16 1974-09-28 USA MO St. Louis
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3386 ryanji01 Jimmy Ryan 1886 1 CHN NL 84 327 58 ... NaN 12 28.0 NaN NaN 1885-10-08 1903-09-24 USA MA Clinton
3390 seeryem01 Emmett Seery 1886 1 SL5 NL 126 453 73 ... NaN 57 82.0 NaN NaN 1884-04-17 1892-06-10 USA IL Princeville
3393 shawdu01 Dupee Shaw 1886 1 WS8 NL 45 148 13 ... NaN 14 44.0 NaN NaN 1883-06-18 1888-07-17 USA MA Charlestown
3394 shindbi01 Billy Shindle 1886 1 DTN NL 7 26 4 ... NaN 0 5.0 NaN NaN 1886-10-05 1898-09-17 USA NJ Gloucester
3395 shochge01 George Shoch 1886 1 WS8 NL 26 95 11 ... NaN 2 13.0 NaN NaN 1886-09-10 1897-10-02 USA PA Philadelphia
3396 smithbi03 Billy Smith 1886 1 DTN NL 10 38 2 ... NaN 1 14.0 NaN NaN 1886-07-06 1886-09-13 USA LA New Orleans
3399 smithph01 Phenomenal Smith 1886 1 DTN NL 3 9 0 ... NaN 0 3.0 NaN NaN 1884-08-14 1891-06-15 USA PA Philadelphia
3406 startjo01 Joe Start 1886 1 WS8 NL 31 122 10 ... NaN 5 13.0 NaN NaN 1871-05-18 1886-07-09 USA NY New York
3407 stemmbi01 Bill Stemmyer 1886 1 BSN NL 41 148 24 ... NaN 12 17.0 NaN NaN 1885-10-03 1888-06-15 USA OH Cleveland
3412 strikjo01 John Strike 1886 1 PHI NL 2 7 0 ... NaN 0 4.0 NaN NaN 1886-09-24 1886-09-30 USA PA Philadelphia
3415 sundabi01 Billy Sunday 1886 1 CHN NL 28 103 16 ... NaN 7 26.0 NaN NaN 1883-05-22 1890-10-04 USA IA Ames
3416 suttoez01 Ezra Sutton 1886 1 BSN NL 116 499 83 ... NaN 26 21.0 NaN NaN 1871-05-04 1888-06-20 USA NY Seneca Falls
3418 sweench01 Charlie Sweeney 1886 1 SL5 NL 17 64 4 ... NaN 3 10.0 NaN NaN 1882-05-11 1887-07-09 USA CA San Francisco
3421 tatepo01 Pop Tate 1886 1 BSN NL 31 106 13 ... NaN 7 17.0 NaN NaN 1885-09-26 1890-10-15 USA VA Richmond
3425 thompsa01 Sam Thompson 1886 1 DTN NL 122 503 101 ... NaN 35 31.0 NaN NaN 1885-07-02 1906-09-10 USA IN Danville
3426 titcoca01 Ledell Titcomb 1886 1 PHI NL 5 16 0 ... NaN 0 5.0 NaN NaN 1886-05-05 1890-10-15 USA ME West Baldwin
3429 twitcla01 Larry Twitchell 1886 1 DTN NL 4 16 0 ... NaN 0 2.0 NaN NaN 1886-04-30 1894-07-07 USA OH Cleveland
3431 wardjo01 John Ward 1886 1 NY1 NL 122 491 82 ... NaN 19 46.0 NaN NaN 1878-07-15 1894-09-29 USA PA Bellefonte
3434 welchmi01 Mickey Welch 1886 1 NY1 NL 59 213 17 ... NaN 7 47.0 NaN NaN 1880-05-01 1892-05-17 USA NY Brooklyn
3437 whitede01 Deacon White 1886 1 DTN NL 124 491 65 ... NaN 31 35.0 NaN NaN 1871-05-04 1890-10-04 USA NY Caton
3439 whitied01 Ed Whiting 1886 1 WS8 NL 6 21 0 ... NaN 1 12.0 NaN NaN 1882-05-02 1886-07-03 USA PA Philadelphia
3441 whitnji01 Jim Whitney 1886 1 KCN NL 67 247 25 ... NaN 29 39.0 NaN NaN 1881-05-02 1890-07-16 USA NY Conklin
3442 wiedmst01 Stump Weidman 1886 1 KCN NL 51 179 13 ... NaN 5 46.0 NaN NaN 1880-08-26 1888-07-05 USA NY Rochester
3443 willine01 Ned Williamson 1886 1 CHN NL 121 430 69 ... NaN 80 71.0 NaN NaN 1878-05-01 1890-09-27 USA PA Philadelphia
3444 winkege01 George Winkelman 1886 1 WS8 NL 1 5 0 ... NaN 0 1.0 NaN NaN 1886-08-02 1886-08-02 USA DC Washington
3445 wisebi01 Bill Wise 1886 1 WS8 NL 1 3 0 ... NaN 0 1.0 NaN NaN 1882-05-02 1886-07-17 USA DC Washington
3446 wisesa01 Sam Wise 1886 1 BSN NL 96 387 71 ... NaN 33 61.0 NaN NaN 1881-07-30 1893-09-29 USA OH Akron
3448 woodge01 George Wood 1886 1 PHI NL 106 450 81 ... NaN 23 75.0 NaN NaN 1880-05-01 1892-09-29 CAN PE Pownal
3449 yingljo01 Joe Yingling 1886 1 WS8 NL 1 2 0 ... NaN 0 1.0 NaN NaN 1886-05-28 1886-05-28 USA MD Baltimore
16223 hamilea01 Earl Hamilton 1916 1 SLA AL 1 0 0 ... NaN 0 0.0 NaN NaN 1911-04-14 1924-05-04 USA IL Gibson

102816 rows × 26 columns

In [27]:
a[a['3B']>30].sort_values(by='AB',ascending=False)
Out[27]:
playerID yearID stint teamID lgID G AB R H 2B ... RBI SB CS BB SO IBB HBP SH SF GIDP
13790 wilsoch01 1912 1 PIT NL 152 583.0 80.0 175.0 19.0 ... 95.0 16.0 NaN 35.0 67.0 NaN 2.0 23.0 NaN NaN
3352 orrda01 1886 1 NY4 AA 136 571.0 93.0 193.0 25.0 ... 91.0 16.0 NaN 17.0 NaN NaN 5.0 NaN NaN NaN
6274 reitzhe01 1894 1 BLN NL 108 446.0 86.0 135.0 22.0 ... 105.0 18.0 NaN 42.0 24.0 NaN 7.0 NaN NaN NaN

3 rows × 22 columns

In [33]:
a[['playerID','yearID','G','H']].sort_values(by='H', ascending=False)
Out[33]:
playerID yearID G H
85812 suzukic01 2004 161 262.0
18555 sislege01 1920 154 257.0
23273 odoulle01 1929 154 254.0
23924 terrybi01 1930 154 254.0
21213 simmoal01 1925 153 253.0
19379 hornsro01 1922 154 250.0
23717 kleinch01 1930 156 250.0
12708 cobbty01 1911 146 248.0
19603 sislege01 1922 142 246.0
81799 suzukic01 2001 157 242.0
22681 manushe01 1928 154 241.0
23670 hermaba01 1930 153 241.0
6682 burkeje01 1896 133 240.0
62314 boggswa01 1985 161 240.0
79602 erstada01 2000 157 240.0
54601 carewro01 1977 155 239.0
7096 keelewi01 1897 129 239.0
7627 delahed01 1899 146 238.0
63809 mattido01 1986 162 238.0
89899 suzukic01 2007 161 238.0
18823 heilmha01 1921 149 237.0
27389 medwijo01 1937 156 237.0
6129 duffyhu01 1894 125 237.0
22341 wanerpa01 1927 155 237.0
19108 tobinja01 1921 150 236.0
18840 hornsro01 1921 154 235.0
23412 wanerll01 1929 151 234.0
66042 puckeki01 1988 158 234.0
12865 jacksjo01 1911 147 233.0
8323 lajoina01 1901 131 232.0
... ... ... ... ...
78973 runyase01 1999 12 NaN
78975 ruschgl01 1999 3 NaN
78978 ryanbj01 1999 13 NaN
78979 ryanja04 1999 8 NaN
78996 santama01 1999 3 NaN
79000 saundto01 1999 9 NaN
79033 sinclst01 1999 3 NaN
79037 slocuhe01 1999 10 NaN
79045 snydejo02 1999 25 NaN
79052 sparkje01 1999 8 NaN
79056 spencse01 1999 2 NaN
79063 spradje01 1999 4 NaN
79072 steinbl01 1999 1 NaN
79073 steinbl01 1999 12 NaN
79076 steveda01 1999 5 NaN
79086 sturtta01 1999 1 NaN
79098 tamje01 1999 1 NaN
79112 tessmja01 1999 6 NaN
79154 vizcalu01 1999 1 NaN
79159 wagnepa01 1999 3 NaN
79165 wallade01 1999 8 NaN
79176 watsoal01 1999 3 NaN
79177 watsoal01 1999 21 NaN
79179 weaveer01 1999 8 NaN
79187 wellski01 1999 7 NaN
79190 wengedo01 1999 11 NaN
79192 wheelda01 1999 6 NaN
79213 willito02 1999 13 NaN
79226 wolcobo01 1999 4 NaN
79237 yarnaed01 1999 5 NaN

101332 rows × 4 columns

In [61]:
# Create data
x = a['2B']
y = a['RBI']
colors = (0,0,0)
area = np.pi*3

# Plot
plt.scatter(x, y, s=area, c=colors, alpha=0.5)
plt.title('Scatter plot pythonspot.com')
plt.xlabel('2B')
plt.ylabel('RBI')
plt.show()
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
In [ ]: